from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

def text_splitter(path, *, chunk_size=50, chunk_overlap=10, separators=None):
    """Load a UTF-8 text file and split it into overlapping chunks.

    The file is read with ``TextLoader`` and the resulting documents are
    split with ``RecursiveCharacterTextSplitter``.

    Args:
        path: Path of the text file to load; it is decoded as UTF-8.
        chunk_size: Target size of each chunk, measured with ``len``.
        chunk_overlap: Number of characters shared by neighbouring chunks.
        separators: Optional sequence of literal (non-regex) separator
            strings, in priority order. When ``None``, the default list is
            used: paragraph break, line break, then full-width and ASCII
            sentence/clause punctuation.

    Returns:
        The list of split documents returned by ``split_documents``.
    """
    if separators is None:
        # Coarsest boundaries first, then full-width punctuation, then the
        # ASCII equivalents.
        # NOTE(review): there is no "" fallback here, so a run of text that
        # contains none of these separators presumably stays as a single
        # chunk longer than chunk_size — confirm against the splitter docs.
        separators = ["\n\n", "\n", "。", "？", "！", "，", ".", "?", "!", ","]

    loader = TextLoader(path, encoding="utf-8")

    # Named ``splitter`` so the local does not shadow this function's name.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,  # measure chunk length in characters
        is_separator_regex=False,  # separators are matched literally
        separators=list(separators),
    )

    return splitter.split_documents(loader.load())

# TODO: implement pdf_splitter(path); only this stub signature exists so far.